import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree
from xgboost import XGBRegressor
# NOTE: pin openpyxl==3.1.0 -- version 3.1.1 has a bug reading these workbooks.
MajorRoadsEmissionsURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/8c520de2-c518-4e64-932c-0071ac826742/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
SupportingDocURL = 'https://data.london.gov.uk/download/london-atmospheric-emissions-inventory-2013/1f3755f3-dae8-4d39-b9b4-0cc7adb4e826/1.%20Supporting%20Information.zip'
LocalFileMajorRoadsEmissions = 'data/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx'
LocalFileSupportingDoc = 'data/1. Supporting Information.zip'

# Create the data folder on first run.
if not os.path.exists('data'):
    os.makedirs('data')

# Use the locally cached emissions spreadsheet when it exists.
load_from_local = os.path.exists(LocalFileMajorRoadsEmissions)
if load_from_local:
    MajorRoadsEmissionsURL = LocalFileMajorRoadsEmissions
xls = pd.ExcelFile(MajorRoadsEmissionsURL)

# Supporting-information zip: check its own cache file rather than reusing the
# spreadsheet's flag (the original raised FileNotFoundError when only the
# spreadsheet had been cached).
if os.path.exists(LocalFileSupportingDoc):
    DocZip = ZipFile(LocalFileSupportingDoc)
else:
    resp = urlopen(SupportingDocURL)
    DocZip = ZipFile(BytesIO(resp.read()))

DocZip.namelist()[:10]
# -> '1. Supporting Information/...' incl. the Road Traffic Data AADT-VKM workbooks
xls.sheet_names
# -> ['2013 LTS Rds', '2013 Other Major Rds']

# LTS roads sheet: per-link emissions split by vehicle class.
df_LTSroads = pd.read_excel(xls, '2013 LTS Rds')
print(df_LTSroads.shape)
df_LTSroads.head()
# -> (366220, 32); columns include GridId, Toid, GRID_ExactCut_ID, Pollutant,
#    and per-vehicle-class emissions (PetrolCar, DieselCar, ..., ElectricLgv)
# Other major roads sheet: same layout but with a 'DotRef' column where the
# LTS sheet has 'Lts'.
df_OtherMajorRoads = pd.read_excel(xls, '2013 Other Major Rds')
print(df_OtherMajorRoads.shape)
df_OtherMajorRoads.head()
# -> (513740, 32)
# Stack the two road tables. The union of columns gives 33: 'Lts' and 'DotRef'
# each come from only one sheet, so they are NaN for the other sheet's rows.
df_RoadEmissions = pd.concat([df_LTSroads, df_OtherMajorRoads], ignore_index=True)
print(df_RoadEmissions.shape)
df_RoadEmissions.head()
# -> (879960, 33)
# 2013 traffic workbook (AADT counts and VKM) from the supporting-information zip.
traffic_excel = DocZip.read('1. Supporting Information/1. Road Traffic Data/Excel/LAEI2013_2013_AADT-VKM.xlsx')
traffic_xls = pd.ExcelFile(BytesIO(traffic_excel))
df_AADT = pd.read_excel(traffic_xls, 'MajorGrid_AADTandVKM_2013')
print(df_AADT.shape)
df_AADT.head()
# -> (87999, 44); per-link AADT and VKM by vehicle class, plus Speed (kph)
# Left-join emissions onto traffic counts by road link (Toid) and grid cell.
# The key columns are named identically on both sides, so a single `on=` works.
df = pd.merge(df_RoadEmissions, df_AADT, how='left',
              on=['Toid', 'GRID_ExactCut_ID'])
print(df.shape)
df.head()
# -> (879960, 75); shared non-key columns get _x (emissions) / _y (traffic) suffixes
# Drop the VKM_* columns brought in by the merge -- only the AADT counts are
# used as features. (The original mutated the column Index while looping over
# it; a boolean mask does the same in one step.)
df = df.loc[:, ~df.columns.str.contains('VKM')]
print(df.shape)
df.head()
# -> (879960, 58)
df.columns
# -> Index(['GridId', 'Toid', 'GRID_ExactCut_ID', ..., 'AADT Motorcycle', ...,
#           'AADT TOTAL', 'Speed (kph)', 'Length (m)_y'])
# Candidate features: link length, per-vehicle-class AADT counts, and speed.
label = [c for c in df.columns
         if 'AADT' in c or 'Speed' in c or 'Length' in c]
label
# Remove columns that are not usable features: the aggregate total, speed, and
# the duplicated length column from the traffic side of the merge.
label.remove('AADT TOTAL')
label.remove('Speed (kph)')
label.remove('Length (m)_y')
label
# -> 17 features: 'Length (m)_x' plus 16 per-vehicle-class AADT counts

# Targets: per-vehicle-class emissions columns (summed into one value below).
target = ['Motorcycle', 'Taxi', 'Car',
          'BusAndCoach', 'Lgv', 'Rigid', 'Artic', 'Rigid2Axle', 'Rigid3Axle',
          'Rigid4Axle', 'Artic3Axle', 'Artic5Axle', 'Artic6Axle', 'PetrolCar',
          'DieselCar', 'PetrolLgv', 'DieselLgv', 'LtBus', 'Coach', 'ElectricCar',
          'ElectricLgv']
# Create the visuals folder on first run.
if not os.path.exists('visuals'):
    os.makedirs('visuals')

# Correlation matrix across all features and targets.
corr = df[label + target].corr()
plt.figure(figsize=(30, 30))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.savefig('visuals/main_correlation_matrix.png')
plt.close()  # release the very large figure

# Regression target: total emissions across all vehicle classes.
df['Total Emissions'] = df[target].sum(axis=1)

# One DataFrame per pollutant.
df_dict = {k: v for k, v in df.groupby('Pollutant')}
df_dict.keys()
# -> CO2, NOx, and PM10/PM25 Brake/Exhaust/Resusp/Tyre
for k, v in df_dict.items():
    print(k, v.shape)
# -> each pollutant: (87996, 59)
# (train_test_split is already imported at the top of the file; the original
# re-imported it here.)
train_test_set = {}
for k, v in df_dict.items():
    x_train, x_test, y_train, y_test = train_test_split(
        v[label], v['Total Emissions'], test_size=0.2, random_state=42)
    # Reset indexes so rows line up positionally after later transforms.
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)
    train_test_set[k] = {'x_train': x_train, 'x_test': x_test,
                         'y_train': y_train, 'y_test': y_test}
train_test_set.keys()
# -> same 10 pollutant keys as df_dict
# Univariate feature selection: per pollutant, keep the 10 features with the
# highest F-statistic against 'Total Emissions'.
for k, v in train_test_set.items():
    selector = SelectKBest(f_regression, k=10)
    selector.fit(v['x_train'], v['y_train'])
    v['x_train'] = selector.transform(v['x_train'])
    v['x_test'] = selector.transform(v['x_test'])
    # Map the kept column positions back to feature names.
    v['selected_features'] = [label[i] for i in selector.get_support(indices=True)]
    v['x_train'] = pd.DataFrame(v['x_train'], columns=v['selected_features'])
    v['x_test'] = pd.DataFrame(v['x_test'], columns=v['selected_features'])
    v['selector'] = selector  # kept for score/p-value reporting below
    print(k, v['selected_features'])
# -> every pollutant selects the same 10 features (length + car/LGV/HGV counts)
# Tabulate the selected features with their F-scores and p-values.
# BUG FIX: `selector.scores_` / `pvalues_` are indexed by ORIGINAL feature
# position (17 entries), while `selected_features` holds only the 10 kept
# names -- the original code paired selected_features[i] with scores_[i],
# attributing scores to the wrong features. Index the score arrays through
# get_support(indices=True) instead. Rows are collected in a list and the
# DataFrame built once (the concat-per-row loop was quadratic).
rows = []
for k, v in train_test_set.items():
    support = v['selector'].get_support(indices=True)
    for feat, idx in zip(v['selected_features'], support):
        rows.append([k, feat, v['selector'].scores_[idx], v['selector'].pvalues_[idx]])
df_selected_features = pd.DataFrame(rows, columns=['Pollutant', 'Feature', 'Score', 'Pvalue'])
df_selected_features = df_selected_features.sort_values(by=['Pollutant', 'Score'], ascending=False)
df_selected_features.head()
# -> top rows: PM25_Tyre features sorted by descending F-score, p-values ~0
# Persist the selected-feature table.
if not os.path.exists('features'):
    os.makedirs('features')
df_selected_features.to_csv('features/df_selected_features.csv', index=False)

# Export the train/test splits as CSV, target column on the left.
if not os.path.exists('output'):
    os.makedirs('output')
for k, v in train_test_set.items():
    df_train_output = pd.concat([v['y_train'], v['x_train']], axis=1)
    df_test_output = pd.concat([v['y_test'], v['x_test']], axis=1)
    df_train_output.to_csv('output/' + k + '_train.csv', index=False)
    df_test_output.to_csv('output/' + k + '_test.csv', index=False)

# Correlation heatmap of the selected features, one image per pollutant.
for k, v in train_test_set.items():
    # x_train is already a DataFrame with the selected columns; no re-wrap needed.
    corr = v['x_train'].corr()
    print(k)
    plt.figure(figsize=(10, 10))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
    plt.savefig('visuals/' + k + '_correlation_matrix.png')
    plt.close()  # free the figure between iterations
# Fresh-session fallback: if train_test_set is not in memory, rebuild it from
# the CSVs written above. Column 0 is the target, the rest are features.
try:
    train_test_set
except NameError:
    train_test_set = {}
    for file in os.listdir('output'):
        if file.endswith('_train.csv'):
            pollutant = file[:file.rfind('_')]
            df_train = pd.read_csv('output/' + file)
            df_test = pd.read_csv('output/' + pollutant + '_test.csv')
            train_test_set[pollutant] = {'x_train': df_train.iloc[:, 1:],
                                         'x_test': df_test.iloc[:, 1:],
                                         'y_train': df_train.iloc[:, 0],
                                         'y_test': df_test.iloc[:, 0]}
            train_test_set[pollutant]['selected_features'] = list(
                train_test_set[pollutant]['x_train'].columns)
# Standardize the features: fit the scaler on the training split only, then
# apply it to both splits (avoids test-set leakage).
for k, v in train_test_set.items():
    scaler = StandardScaler()
    scaler.fit(v['x_train'])
    v['x_train'] = pd.DataFrame(scaler.transform(v['x_train']),
                                columns=v['selected_features'])
    v['x_test'] = pd.DataFrame(scaler.transform(v['x_test']),
                               columns=v['selected_features'])
    # Keep the fitted scaler for transforming any future data.
    v['scaler'] = scaler
# Registry of results, keyed by model family ('LR', 'SVR', 'DTR', ...).
model_dict = {}


def train_eval_model(model, x_train, y_train, x_test, y_test):
    """Fit `model` on the training split and evaluate it on the test split.

    Returns a dict with the fitted model and its test-set MSE, RMSE and R^2.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return {'model': model, 'mse': mse, 'rmse': rmse, 'r2': r2}
def hyperparameter_tuning(model, param_grid, pollutant, model_dict, x_train, y_train, x_test, y_test):
    """Grid-search `model` over `param_grid` with 5-fold CV.

    Stores the best estimator, its grid-search params/CV score, and its
    test-set MSE/RMSE/R^2 under ``model_dict[pollutant]`` (the entry must
    already exist). Returns the mutated `model_dict`.
    """
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(x_train, y_train)
    model_dict[pollutant]['best_model'] = grid_search.best_estimator_
    # Evaluate the winning estimator on the held-out test split.
    y_pred = grid_search.best_estimator_.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    model_dict[pollutant]['best_mse'] = mse
    model_dict[pollutant]['best_rmse'] = np.sqrt(mse)
    model_dict[pollutant]['best_r2'] = r2_score(y_test, y_pred)
    model_dict[pollutant]['best_params'] = grid_search.best_params_
    model_dict[pollutant]['best_score'] = grid_search.best_score_
    return model_dict
# Baseline: ordinary least squares, one model per pollutant.
LR_dict = {}
for k, v in train_test_set.items():
    LR_dict[k] = train_eval_model(
        LinearRegression(), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, LR_dict[k]['mse'], LR_dict[k]['r2'], LR_dict[k]['rmse'])
# -> test R^2 roughly 0.57-0.65 across pollutants
model_dict['LR'] = LR_dict
# Support vector regression (default RBF kernel), one model per pollutant.
SVR_dict = {}
for k, v in train_test_set.items():
    SVR_dict[k] = train_eval_model(SVR(), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, SVR_dict[k]['mse'], SVR_dict[k]['r2'], SVR_dict[k]['rmse'])
# -> clearly worse than OLS; several pollutants even score negative R^2
model_dict['SVR'] = SVR_dict
# Decision tree regressor, one model per pollutant.
DTR_dict = {}
for k, v in train_test_set.items():
    DTR_dict[k] = train_eval_model(DecisionTreeRegressor(), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, DTR_dict[k]['mse'], DTR_dict[k]['r2'], DTR_dict[k]['rmse'])
# -> big jump over linear models: test R^2 ~0.69-0.98

# Render each fitted tree.
# BUG FIX: the original iterated an undefined name `DTR_set` and read
# v['x_train'] from the results dict, which only holds model/mse/rmse/r2.
# Iterate DTR_dict and take the feature names from train_test_set instead.
for k, v in DTR_dict.items():
    fig, ax = plt.subplots(figsize=(120, 12))
    plot_tree(v['model'],
              ax=ax,
              feature_names=train_test_set[k]['x_train'].columns,
              fontsize=12,
              filled=True)
    plt.savefig('visuals/' + k + '_DTR.png')
    plt.close(fig)  # these figures are huge; free them immediately
# Tune the tree depth per pollutant.
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, None]}
for k, v in train_test_set.items():
    DTR_dict = hyperparameter_tuning(DecisionTreeRegressor(), param_grid, k, DTR_dict,
                                     v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, DTR_dict[k]['best_params'], DTR_dict[k]['best_score'],
          DTR_dict[k]['best_mse'], DTR_dict[k]['best_r2'], DTR_dict[k]['best_rmse'])
# -> best depths 9-12 or None; tuned test R^2 up to ~0.985
model_dict['DTR'] = DTR_dict
# Random forest per pollutant, then tune the number of trees.
RFR_dict = {}
for k, v in train_test_set.items():
    RFR_dict[k] = train_eval_model(RandomForestRegressor(random_state=42),
                                   v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, RFR_dict[k]['mse'], RFR_dict[k]['r2'], RFR_dict[k]['rmse'])
# -> test R^2 ~0.87-0.99, consistently above the single tree

param_grid = {'n_estimators': [120, 150, 200]}
for k, v in train_test_set.items():
    RFR_dict = hyperparameter_tuning(RandomForestRegressor(random_state=42), param_grid, k, RFR_dict,
                                     v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, RFR_dict[k]['best_params'], RFR_dict[k]['best_mse'],
          RFR_dict[k]['best_r2'], RFR_dict[k]['best_rmse'])
# -> gains over the default 100 trees are marginal
model_dict['RFR'] = RFR_dict
# Gradient boosting per pollutant, then tune n_estimators and max_depth.
GBR_dict = {}
for k, v in train_test_set.items():
    GBR_dict[k] = train_eval_model(GradientBoostingRegressor(random_state=42),
                                   v['x_train'], v['y_train'], v['x_test'], v['y_test'])
    print(k, GBR_dict[k]['mse'], GBR_dict[k]['r2'], GBR_dict[k]['rmse'])
# -> test R^2 ~0.84-0.98

param_grid = {'n_estimators': [120, 150, 200],
              'max_depth': [4, 6]}
for k, v in train_test_set.items():
    GBR_dict = hyperparameter_tuning(GradientBoostingRegressor(random_state=42),
                                     param_grid,
                                     pollutant=k,
                                     model_dict=GBR_dict,
                                     x_train=v['x_train'],
                                     y_train=v['y_train'],
                                     x_test=v['x_test'],
                                     y_test=v['y_test'])
    print(k, GBR_dict[k]['best_params'], GBR_dict[k]['best_mse'],
          GBR_dict[k]['best_r2'], GBR_dict[k]['best_rmse'])
model_dict['GBR'] = GBR_dict
XGB_dict = {}
for k, v in train_test_set.items():
XGB_dict[k] = train_eval_model(XGBRegressor(random_state=42), v['x_train'], v['y_train'], v['x_test'], v['y_test'])
print(k, XGB_dict[k]['mse'], XGB_dict[k]['r2'], XGB_dict[k]['rmse'])
CO2 4248.047338888423 0.9897511554537981 65.1770461043489 NOx 0.07502231582260359 0.9768694155272984 0.27390201865375796 PM10_Brake 0.0006517468906207694 0.8680425094394434 0.025529333924346115 PM10_Exhaust 2.649124024145316e-05 0.9867487238372276 0.0051469641772071 PM10_Resusp 0.00012123851173245462 0.9936592010129184 0.011010836105058264 PM10_Tyre 8.80976727638107e-06 0.989351102994401 0.002968125212382569 PM25_Brake 0.00010614136977098437 0.8691330049394411 0.010302493376410603 PM25_Exhaust 2.2290516379026105e-05 0.9839105883364085 0.004721283340261004 PM25_Resusp 2.3633240852221214e-07 0.9909603087283639 0.000486140317729575 PM25_Tyre 4.43777905787662e-06 0.9891411988547224 0.0021066036784066953
# Hyperparameter search for XGBRegressor: deeper trees than the GBR grid,
# since XGBoost's baseline already favoured more complex models.
param_grid = {'n_estimators': [120, 150, 200],
              'max_depth': [6, 8]}
for pollutant, splits in train_test_set.items():
    XGB_dict = hyperparameter_tuning(XGBRegressor(random_state=42),
                                     param_grid,
                                     pollutant=pollutant,
                                     model_dict=XGB_dict,
                                     x_train=splits['x_train'],
                                     y_train=splits['y_train'],
                                     x_test=splits['x_test'],
                                     y_test=splits['y_test'])
    tuned = XGB_dict[pollutant]
    print(pollutant, tuned['best_params'], tuned['best_mse'], tuned['best_r2'], tuned['best_rmse'])
CO2 {'max_depth': 6, 'n_estimators': 200} 3726.5828714324016 0.9910092413075973 61.045744089431835
NOx {'max_depth': 8, 'n_estimators': 200} 0.06234448170662684 0.9807781953314824 0.24968876968463527
PM10_Brake {'max_depth': 6, 'n_estimators': 120} 0.0006546082734722818 0.8674631727275199 0.025585313628569844
PM10_Exhaust {'max_depth': 6, 'n_estimators': 200} 2.433226087529787e-05 0.9878286744756233 0.004932774156121266
PM10_Resusp {'max_depth': 6, 'n_estimators': 200} 0.00010098858053788854 0.9947182765605491 0.0100493074655863
PM10_Tyre {'max_depth': 6, 'n_estimators': 200} 7.934530900636037e-06 0.9904090539854391 0.0028168299381815787
PM25_Brake {'max_depth': 6, 'n_estimators': 120} 0.00010611452254087318 0.869166106230087 0.010301190345822817
PM25_Exhaust {'max_depth': 6, 'n_estimators': 200} 2.0287439389856145e-05 0.9853564198157971 0.004504158011199889
PM25_Resusp {'max_depth': 8, 'n_estimators': 120} 2.227621662549712e-07 0.9914793691540763 0.0004719768704660973
PM25_Tyre {'max_depth': 6, 'n_estimators': 200} 3.982623367291088e-06 0.9902549192697659 0.001995651113619585
model_dict['XGB'] = XGB_dict
dict_items([('CO2', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 4248.047338888423, 'rmse': 65.1770461043489, 'r2': 0.9897511554537981, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 3726.5828714324016, 'best_rmse': 61.045744089431835, 'best_r2': 0.9910092413075973, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9876430626771558}), ('NOx', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 0.07502231582260359, 'rmse': 0.27390201865375796, 'r2': 0.9768694155272984, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=8, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 0.06234448170662684, 'best_rmse': 0.24968876968463527, 'best_r2': 0.9807781953314824, 'best_params': {'max_depth': 8, 'n_estimators': 200}, 'best_score': 0.9781044540566057}), ('PM10_Brake', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 0.0006517468906207694, 'rmse': 0.025529333924346115, 'r2': 0.8680425094394434, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=120, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 0.0006546082734722818, 'best_rmse': 0.025585313628569844, 'best_r2': 0.8674631727275199, 'best_params': {'max_depth': 6, 'n_estimators': 120}, 'best_score': 0.841849838341607}), ('PM10_Exhaust', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 2.649124024145316e-05, 'rmse': 0.0051469641772071, 'r2': 0.9867487238372276, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 2.433226087529787e-05, 'best_rmse': 0.004932774156121266, 'best_r2': 0.9878286744756233, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9868177732475024}), ('PM10_Resusp', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 0.00012123851173245462, 'rmse': 0.011010836105058264, 'r2': 0.9936592010129184, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 0.00010098858053788854, 'best_rmse': 0.0100493074655863, 'best_r2': 0.9947182765605491, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9921829846268164}), ('PM10_Tyre', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 8.80976727638107e-06, 'rmse': 0.002968125212382569, 'r2': 0.989351102994401, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 7.934530900636037e-06, 'best_rmse': 0.0028168299381815787, 'best_r2': 0.9904090539854391, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9875156548266943}), ('PM25_Brake', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 0.00010614136977098437, 'rmse': 0.010302493376410603, 'r2': 0.8691330049394411, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=120, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 0.00010611452254087318, 'best_rmse': 0.010301190345822817, 'best_r2': 0.869166106230087, 'best_params': {'max_depth': 6, 'n_estimators': 120}, 'best_score': 0.8385838354104402}), ('PM25_Exhaust', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 2.2290516379026105e-05, 'rmse': 0.004721283340261004, 'r2': 0.9839105883364085, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 2.0287439389856145e-05, 'best_rmse': 0.004504158011199889, 'best_r2': 0.9853564198157971, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.9869454283964716}), ('PM25_Resusp', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 2.3633240852221214e-07, 'rmse': 0.000486140317729575, 'r2': 0.9909603087283639, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=8, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=120, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 2.227621662549712e-07, 'best_rmse': 0.0004719768704660973, 'best_r2': 0.9914793691540763, 'best_params': {'max_depth': 8, 'n_estimators': 120}, 'best_score': 0.988481709992198}), ('PM25_Tyre', {'model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'mse': 4.43777905787662e-06, 'rmse': 0.0021066036784066953, 'r2': 0.9891411988547224, 'best_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=6, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=200, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=42, ...), 'best_mse': 3.982623367291088e-06, 'best_rmse': 0.001995651113619585, 'best_r2': 0.9902549192697659, 'best_params': {'max_depth': 6, 'n_estimators': 200}, 'best_score': 0.986861243187229})])
# Persist the trained models and consolidate per-model/per-pollutant metrics.
# exist_ok=True avoids the check-then-create race and is more concise.
os.makedirs('pickles', exist_ok=True)
os.makedirs('metrics', exist_ok=True)
# Save the model_dict to a pickle file
with open('pickles/model_dict.pkl', 'wb') as f:
    pickle.dump(model_dict, f)
# Consolidate the metrics for each model from model_dict into a CSV file.
# Collect plain rows first and build the DataFrame once: repeated
# pd.concat inside the loop is quadratic in the number of rows.
columns_list = ['model_name', 'pollutant', 'mse', 'rmse', 'r2',
                'best_mse', 'best_rmse', 'best_r2', 'best_params']
rows = []
for model_name, pollutant_results in model_dict.items():
    for pollutant, res in pollutant_results.items():
        if 'best_model' in res:
            # Tuned models carry both baseline and grid-search metrics.
            rows.append([model_name, pollutant, res['mse'], res['rmse'], res['r2'],
                         res['best_mse'], res['best_rmse'], res['best_r2'], res['best_params']])
        else:
            # Untuned models have no best_* metrics; pad with NaN.
            rows.append([model_name, pollutant, res['mse'], res['rmse'], res['r2'],
                         np.nan, np.nan, np.nan, np.nan])
df_metrics = pd.DataFrame(rows, columns=columns_list)
# index=False: the original wrote an all-zero index column that later
# read_csv reloads as a spurious 'Unnamed: 0' column.
df_metrics.to_csv('metrics/consolidated_metrics.csv', index=False)
# Lazily restore the selected-feature table: read it from the features
# folder only when this session does not already hold it in memory.
if 'df_selected_features' not in globals():
    df_selected_features = pd.read_csv('features/selected_features.csv')
df_selected_features.head()
# Lazily restore cached artefacts so the analysis cells below can run in a
# fresh session without re-training: unpickle the model dictionary and
# re-read the consolidated metrics only when they are missing.
if 'model_dict' not in globals():
    with open('pickles/model_dict.pkl', 'rb') as f:
        model_dict = pickle.load(f)
if 'df_metrics' not in globals():
    df_metrics = pd.read_csv('metrics/consolidated_metrics.csv')
df_metrics.head()